last updated: 2023-09-27
As usual, make sure we have the right packages for this exercise
if (!require("pacman")) install.packages("pacman"); library(pacman)
# let's load all of the files we were using and want to have again today
p_load("tidyverse", "knitr", "readr",
"pander", "BiocManager",
"dplyr", "stringr",
"purrr", # for working with lists (beautify column names)
"scales", "viridis", # for ggplot
"reactable") # for pretty tables.
# We also need these packages today.
p_load("DESeq2", "edgeR", "AnnotationDbi", "org.Sc.sgd.db",
"ggrepel",
"Glimma",
"ggVennDiagram", "ggplot2")
This exercises shows more ways differential expression analysis data can be visualized.
At the end of this exercise, you should be able to:
library(org.Sc.sgd.db)
# load in all of the DE results for the difference of difference contrast
path_output_edgeR <- "~/Desktop/Genomic_Data_Analysis/Analysis/edgeR/yeast_topTags_edgeR.tsv"
path_output_DESeq2 <- "~/Desktop/Genomic_Data_Analysis/Analysis/DESeq2/yeast_res_DESeq2.tsv"
path_output_limma <- "~/Desktop/Genomic_Data_Analysis/Analysis/limma/yeast_topTags_limma.tsv"
# if you don't have these files, we generated them in previous lessons.
# you can remove the "#" from the chunks below to grab them from the interwebs.
# path_output_edgeR <- "https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_topTags_edgeR.tsv"
# path_output_DESeq2 <- "https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_res_DESeq2.tsv"
# path_output_limma <- "https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_topTags_limma.tsv"
topTags_edgeR <- read_tsv(path_output_edgeR)
## Rows: 5615 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): ORF, SGD, GENENAME
## dbl (5): logFC, logCPM, F, PValue, FDR
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
topTags_DESeq2 <- read_tsv(path_output_DESeq2)
## Rows: 5622 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): ORF, GENENAME, SGD
## dbl (6): baseMean, log2FoldChange, lfcSE, stat, pvalue, padj
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
topTags_limma <- read_tsv(path_output_limma)
## Rows: 5615 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): ORF, SGD, GENENAME
## dbl (6): logFC, AveExpr, t, P.Value, adj.P.Val, B
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sig_cutoff <- 0.05
FC_cutoff <- 1
## edgeR
# get genes that are upregualted
up_edgeR_DEG <- topTags_edgeR %>%
dplyr::filter(FDR < sig_cutoff & logFC > FC_cutoff) %>%
pull(ORF)
down_edgeR_DEG <- topTags_edgeR %>%
dplyr::filter(FDR < sig_cutoff & logFC < -FC_cutoff) %>%
pull(ORF)
## DESeq2
up_DESeq2_DEG <- topTags_DESeq2 %>%
dplyr::filter(padj < sig_cutoff & log2FoldChange > FC_cutoff) %>%
pull(ORF)
down_DESeq2_DEG <- topTags_DESeq2 %>%
dplyr::filter(padj < sig_cutoff & log2FoldChange < -FC_cutoff) %>%
pull(ORF)
## limma
up_limma_DEG <- topTags_limma %>%
dplyr::filter(adj.P.Val < sig_cutoff & logFC > FC_cutoff) %>%
pull(ORF)
down_limma_DEG <- topTags_limma %>%
dplyr::filter(adj.P.Val < sig_cutoff & logFC < -FC_cutoff) %>%
pull(ORF)
up_DEG_results_list <- list(up_edgeR_DEG,
up_DESeq2_DEG,
up_limma_DEG)
MA plots display a log ratio (M) vs an average (A) in order to
visualize the differences between two groups. In general we would expect
the expression of genes to remain consistent between conditions and so
the MA plot should be similar to the shape of a trumpet with most points
residing on a y intercept of 0. DESeq2 has a built in function for
creating the MA plot that we have used before (plotMA()),
but we can also make our own:
# assign pvalue and logFC cutoffs for coloring DE genes
sig_cutoff <- 0.01
FC_label_cutoff <- 3
#plot MA for edgeR using ggplot2
topTags_edgeR %>%
mutate(`Significant FDR` = case_when(
FDR < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(FDR < sig_cutoff & abs(logFC) > FC_label_cutoff ~ ORF,
.default = NA)) %>%
ggplot(aes(x=logCPM, y=logFC, color = `Significant FDR`, label = delabel)) +
geom_point(size=1) +
scale_y_continuous(limits=c(-5, 5), oob=squish) +
geom_hline(yintercept = 0, colour="darkgrey", linewidth=1, linetype="longdash") +
labs(x="mean of normalized counts", y="log fold change") +
# ggrepel::geom_text_repel(size = 1.5) +
scale_color_manual(values = c("black", "red")) +
theme_bw() +
ggtitle("edgeR MA plot")
#plot MA for DESeq2 using ggplot2
topTags_DESeq2 %>%
mutate(
`Significant FDR` = case_when(padj < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(
padj < sig_cutoff & abs(log2FoldChange) > FC_label_cutoff ~ ORF,
.default = NA)
) %>%
ggplot(aes(log(baseMean), log2FoldChange, color = `Significant FDR`, label = delabel)) +
geom_point(size=1) +
scale_y_continuous(limits=c(-5, 5), oob=squish) +
geom_hline(yintercept = 0, colour="darkgrey", linewidth=1, linetype="longdash") +
labs(x="mean of normalized counts", y="log fold change") +
# ggrepel::geom_text_repel(size = 1.5) +
scale_color_manual(values = c("black", "red")) +
theme_bw() +
ggtitle("DESeq2 MA plot")
#plot MA for limma using ggplot2
topTags_limma %>%
mutate(
`Significant FDR` = case_when(adj.P.Val < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(
adj.P.Val < sig_cutoff & abs(logFC) > FC_label_cutoff ~ ORF,
.default = NA)
) %>%
ggplot(aes(AveExpr, logFC, color = `Significant FDR`, label = delabel)) +
geom_point(size=1) +
scale_y_continuous(limits=c(-5, 5), oob=squish) +
geom_hline(yintercept = 0, colour="darkgrey", linewidth=1, linetype="longdash") +
labs(x="mean of normalized counts", y="log fold change") +
# ggrepel::geom_text_repel(size = 1.5) +
scale_color_manual(values = c("black", "red")) +
theme_bw() +
ggtitle("limma MA plot")
# change the dimensions of the output figure by clicking the gear icon in topright corner of the code chunk > "use custom figure size"
topTags_edgeR %>%
mutate(`Significant FDR` = case_when(
FDR < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(FDR < sig_cutoff & abs(logFC) > FC_label_cutoff ~ ORF,
.default = NA)) %>%
ggplot(aes(x = logFC, -log10(FDR), color = `Significant FDR`, label = delabel)) +
geom_point(size = 1) +
ggrepel::geom_text_repel(size = 1.5) +
labs(x = "log fold change", y = "-log10(adjusted p-value)") +
theme_bw() +
guides(color="none") +
scale_color_manual(values = c("black", "red")) +
ggtitle("edgeR Volcano plot")
## Warning: Removed 5556 rows containing missing values (`geom_text_repel()`).
topTags_DESeq2 %>%
mutate(
`Significant FDR` = case_when(padj < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(
padj < sig_cutoff & abs(log2FoldChange) > FC_label_cutoff ~ ORF,
.default = NA)
) %>%
ggplot(aes(log2FoldChange,-log10(padj), color = `Significant FDR`, label = delabel)) +
geom_point(size = 1) +
ggrepel::geom_text_repel(size = 1.5) +
labs(x = "log fold change", y = "-log10(adjusted p-value)") +
theme_bw() +
guides(color="none") +
scale_color_manual(values = c("black", "red")) +
ggtitle("DESeq2 Volcano plot")
## Warning: Removed 5559 rows containing missing values (`geom_text_repel()`).
## Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
topTags_limma %>%
mutate(
`Significant FDR` = case_when(adj.P.Val < sig_cutoff ~ "Yes",
.default = "No"),
delabel = case_when(
adj.P.Val < sig_cutoff & abs(logFC) > FC_label_cutoff ~ ORF,
.default = NA)
) %>%
ggplot(aes(x=logFC, y=-log10(P.Value), color = `Significant FDR`, label = delabel)) +
geom_point(size = 1) +
ggrepel::geom_text_repel(size = 1.5) +
labs(x = "log fold change", y = "-log10(UNADJUSTED p-value)") +
theme_bw() +
guides(color="none") +
scale_color_manual(values = c("black", "red")) +
ggtitle("limma Volcano plot")
## Warning: Removed 5557 rows containing missing values (`geom_text_repel()`).
# load in res objects for both limma and edgeR
res_limma <- readRDS("~/Desktop/Genomic_Data_Analysis/Analysis/limma/yeast_res_limma.Rds")
res_edgeR <- readRDS("~/Desktop/Genomic_Data_Analysis/Analysis/edgeR/yeast_res_edgeR.Rds")
# code to pull it from github:
# res_limma <- read_rds("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_res_limma.Rds")
# res_edgeR <- read_rds("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_res_edgeR.Rds")
# load in the DGE lists for each
y_limma <- readRDS("~/Desktop/Genomic_Data_Analysis/Analysis/limma/yeast_y_limma.Rds")
y_edgeR <- readRDS("~/Desktop/Genomic_Data_Analysis/Analysis/edgeR/yeast_y_edgeR.Rds")
# again, alternative code to pull from github
# y_limma <- read_rds("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_y_limma.Rds")
# y_edgeR <- read_rds("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_y_edgeR.Rds")
glimmaMA(res_limma, dge = y_limma)
glimmaMA(res_edgeR, dge = y_edgeR)
## Warning in buildXYData(table, status, main, display.columns, anno, counts, :
## count transform requested but not all count values are integers.
glimmaVolcano(res_limma)
glimmaVolcano(res_edgeR)
This visualization approach compresses relevant information, so it’s generally a discouraged approach for visualizing DE data. However, it is done, so if it is useful for your study, here is how you could do it.
# let's use the res_all object from the 08_DE_limma exercise:
res_all_limma <- read_rds('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_res_allContrasts_limma.Rds')
decideTests_all_edgeR <- read_tsv('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/analysis/yeast_decideTests_allContrasts_edgeR.tsv')
## Rows: 5615 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene
## dbl (5): -1*WT.unstressed 1*WT.EtOH, -1*msn24dd.unstressed 1*msn24dd.EtOH, -...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
res_all_limma %>%
decideTests(p.value = 0.05, lfc = 0) %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
pivot_longer(-gene, names_to = "contrast", values_to = "DE_direction") %>%
group_by(contrast) %>%
summarise(
upregulated = sum(DE_direction == 1),
downregulated = sum(DE_direction == -1)
) %>%
pivot_longer(-contrast, names_to = "DE_direction", values_to = "n_genes") %>%
ggplot(aes(x = contrast, y = n_genes, fill = DE_direction)) +
geom_col(position = "dodge") +
theme_bw() +
coord_flip() +
geom_text(aes(label = n_genes),
position = position_dodge(width = .9),
hjust = "inward") +
labs(y="Number of DE genes") +
ggtitle("Summary of DE genes by contrast (limma)")
# how to do the same for edgeR
decideTests_all_edgeR %>%
pivot_longer(-gene, names_to = "contrast", values_to = "DE_direction") %>%
group_by(contrast) %>%
summarise(
upregulated = sum(DE_direction == 1),
downregulated = sum(DE_direction == -1)
) %>%
pivot_longer(-contrast, names_to = "DE_direction", values_to = "n_genes") %>%
mutate(contrast = fct_reorder(contrast, 1/(1+n_genes))) %>%
ggplot(aes(x = contrast, y = n_genes, fill = DE_direction)) +
geom_col(position = "dodge") +
theme_bw() +
coord_flip() +
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
geom_text(aes(label = n_genes),
position = position_dodge(width = .9),
hjust = "inward") +
labs(y="Number of DE genes") +
ggtitle("Summary of DE genes by contrast (edgeR)")
If we want to show the same amount of information, in a more informative way, a venn diagram is often a better alternative. Here’s an easy way to get that visualization if you use either edgeR or limma for your analysis.
# same as before, we can make the plot from the decideTests output
res_all_limma %>%
decideTests(p.value = 0.01, lfc = 0) %>%
vennDiagram(include=c("up", "down"),
lwd=0.75,
mar=rep(2,4), # increase margin size
counts.col= c("red", "blue"),
show.include=TRUE)
decideTests_all_edgeR %>%
column_to_rownames("gene") %>%
vennDiagram(include=c("up", "down"),
lwd=0.75,
mar=rep(4,4), # increase margin size
counts.col= c("red", "blue"),
show.include=TRUE)
Venn diagrams are useful for showing gene counts as well as there overlaps between contrasts. A useful gui based web-page for creating venn diagrams inclues: https://eulerr.co/. If you enjoy coding, it also exists as an R package (https://cran.r-project.org/web/packages/eulerr/index.html).
# here are all of the contrasts
colnames(res_all_limma)
## [1] "EtOHvsMOCK.WT" "EtOHvsMOCK.MSN24dd" "EtOH.MSN24ddvsWT"
## [4] "MOCK.MSN24ddvsWT" "EtOHvsWT.MSN24ddvsWT"
# select the correct two and replace them below
res_all_limma %>%
decideTests(p.value = 0.05, lfc = 0) %>%
data.frame() %>%
# change the columns selected in this select command
dplyr::select(c("MOCK.MSN24ddvsWT", "EtOH.MSN24ddvsWT")) %>%
vennDiagram(include="down",
lwd=0.75,
mar=rep(0,4), # increase margin size
# counts.col= c("red", "blue"),
show.include=TRUE
)